# -*- coding: utf-8 -*-
"""
Created on Fri Oct 16 09:32:39 2020

@author: Administrator
"""
import numpy as np
import pandas as pd
from pylab import *
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split#切割数据集
from sklearn.linear_model import LinearRegression #线性回归
from sklearn.neighbors import KNeighborsRegressor #K近邻回归
from sklearn.ensemble import RandomForestRegressor #随机森林回归
import lightgbm as lgb # LGB算法
from sklearn.metrics import mean_squared_error #mse评价指标
from sklearn.model_selection import KFold#交叉验证
from sklearn.model_selection import cross_val_score#交叉验证的分数
# -- 1. Load the raw train / test sets --------------------------------------
# Tab-separated text files; the training set carries the extra 'target'
# column that the test set lacks.
train_data = pd.read_csv(r'C:\Users\Administrator\Desktop\工业蒸汽量预测\zhengqi_train.txt',sep='\t',encoding='utf-8')
test_data = pd.read_csv(r'C:\Users\Administrator\Desktop\工业蒸汽量预测\zhengqi_test.txt',sep='\t',encoding='utf-8')
# Column headers (V0..V37 features plus 'target'); reused by the KDE plots.
column = train_data.columns.tolist()[:39]

# -- 2. Visualise pairwise feature correlation ------------------------------
train_corr = train_data.corr()  # correlation coefficient matrix
fig, ax = plt.subplots(figsize=(120, 100), dpi=75)
sns.heatmap(train_corr, vmax=.8, square=True, annot=True, ax=ax)

# 3. KDE distribution plots
# Compare each feature's distribution between the train and test sets.
# Features whose distributions differ between the two sets hurt the model's
# ability to generalise and should be removed.
fig = plt.figure(figsize=(120,100),dpi=75)  # one large canvas for all subplots
for i in range(38):
    plt.subplot(8,5,i+1)
    res = sns.kdeplot(train_data[column[i]],color='Red',shade=True)
    res = sns.kdeplot(test_data[column[i]],color='Blue',shade=True)
    res.set_xlabel(column[i])
    res.set_ylabel("Frequency")
    res = res.legend(["train","test"])
    # NOTE: the original also incremented `i` here, but `for` rebinds the
    # loop variable every iteration, so that statement was dead code.
plt.show()

# Inspection shows V5, V9, V11, V17, V22, V28 have mismatched
# train/test distributions; plot just those six side by side.
mismatched_features = ["V5","V9","V11","V17","V22","V28"]
drop_col = 6
drop_row = 1
plt.figure(figsize=(5*drop_col,5*drop_row))
for i,col in enumerate(mismatched_features):
    ax = plt.subplot(drop_row,drop_col,i+1)
    ax = sns.kdeplot(train_data[col],color='Red',shade=True)
    ax = sns.kdeplot(test_data[col],color='Blue',shade=True)
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")  # fixed typo: was "Fequency"
    ax = ax.legend(["train","test"])
plt.show()

# Large train/test distribution gaps degrade generalisation — drop them.
train_data = train_data.drop(mismatched_features,axis=1)
test_data = test_data.drop(mismatched_features,axis=1)

# 4. Min-max scale all feature columns to [0, 1].
# The scaler is fitted on the TRAINING features only, then the same
# transform is applied to both sets so they share one common scale.
features_columns = [col for col in train_data.columns if col not in ['target']]
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler = min_max_scaler.fit(train_data[features_columns])
train_data_scaler = min_max_scaler.transform(train_data[features_columns])
# BUG FIX: the original line transformed train_data a second time, so the
# "scaled test" frame actually contained scaled TRAINING data.
test_data_scaler = min_max_scaler.transform(test_data[features_columns])
train_data_scaler = pd.DataFrame(train_data_scaler)
train_data_scaler.columns = features_columns
test_data_scaler = pd.DataFrame(test_data_scaler)
test_data_scaler.columns = features_columns
# Re-attach the target to the scaled training frame (indices align).
train_data_scaler['target']=train_data['target']
column = train_data_scaler.columns.tolist()[:33]  # 32 features + 'target'

# 5. PCA to remove multicollinearity, keeping 90% of the variance.
pca = PCA(n_components=0.9)
# iloc[:, 0:-1] drops the trailing 'target' column before fitting.
new_train_pca_90 = pca.fit_transform(train_data_scaler.iloc[:,0:-1])
new_test_pca_90 = pca.transform(test_data_scaler)
new_train_pca_90 = pd.DataFrame(new_train_pca_90)
new_test_pca_90 = pd.DataFrame(new_test_pca_90)
new_train_pca_90['target']=train_data_scaler['target']
new_train_pca_90.describe()

# 6. Train / validation split.
train_pca = new_train_pca_90.fillna(0)  # PCA features with NaNs zero-filled
# BUG FIX: the original computed `train_pca` above but then sliced the
# un-filled `new_train_pca_90`; use the NaN-filled copy as intended.
train = train_pca[new_test_pca_90.columns]  # feature columns only
target = train_pca['target']
# 80% for training, 20% held out for validation.
train_data,test_data,train_target,test_target = train_test_split(
    train,target,test_size=0.2,random_state=25)

# 7. Model training — fit each regressor on the training split and report
#    its mean-squared error on the held-out validation split.
def _fit_and_report(model, message):
    """Fit *model* in place and print *message* followed by its validation MSE."""
    model.fit(train_data, train_target)
    print(message, mean_squared_error(test_target, model.predict(test_data)))

# Multiple linear regression
clf_LR = LinearRegression()
_fit_and_report(clf_LR, "线性回归预测MSE为:")

# K-nearest-neighbours regression (8 neighbours)
clf_KN = KNeighborsRegressor(n_neighbors=8)
_fit_and_report(clf_KN, "k近邻回归预测MSE为:")

# Random forest regression (200 trees)
clf_RF = RandomForestRegressor(n_estimators=200)
_fit_and_report(clf_RF, "随机森林预测MSE为:")

# LightGBM gradient-boosted regression
clf_LGB = lgb.LGBMRegressor(
    objective='regression',
    boosting_type='gbdt',
    learning_rate=0.01,
    max_depth=-1,
    n_estimators=5000,
    random_state=2019)
_fit_and_report(clf_LGB, "LGB回归预测MSE为:")

#模型交叉验证
def Kfold(clf,model_name):
    kf =KFold(n_splits=5)   
    for k,(train_index,test_index) in enumerate(kf.split(train)):
        train_data,test_data,train_target,test_target = train.values[train_index],train.values[test_index],target[train_index],target[test_index]
        clf.fit(train_data,train_target)
        score_train = mean_squared_error(train_target, clf.predict(train_data))
        score_test = mean_squared_error(test_target, clf.predict(test_data))
        print(k,",折",model_name,"回归预测训练集MSE为:",score_train)
        print(k,",折",model_name,"回归预测测试集MSE为:",score_test) 
# Run 5-fold cross-validation for every fitted model, in the same order
# as the original script (linear, KNN, LGB, random forest).
for model, label in [
    (clf_LR, "多元线性"),
    (clf_KN, "k近邻"),
    (clf_LGB, "LGB"),
    (clf_RF, "随机森林"),
]:
    Kfold(model, label)






  




